package org.solrmarc.index.extractor.formatter; import java.text.Normalizer; import java.text.Normalizer.Form; import java.util.Collection; import java.util.Collections; import java.util.EnumSet; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import org.marc4j.marc.DataField; import org.marc4j.marc.VariableField; import org.solrmarc.index.extractor.ExternalMethod; import org.solrmarc.index.mapping.AbstractMultiValueMapping; import org.solrmarc.tools.DataUtil; import org.solrmarc.tools.Utils; public class FieldFormatterBase implements FieldFormatter { String indicatorFmt = null; Map<String, String> sfCodeMap = null; String fieldFormat = null; String separator = null; List<AbstractMultiValueMapping> maps = null; // boolean unique = false; eJoinVal joinVal = eJoinVal.SEPARATE; int substringStart = -1; int substringEnd = -1; EnumSet<eCleanVal> cleanVal = EnumSet.noneOf(eCleanVal.class); String fieldTagFmt = null; public FieldFormatterBase(boolean clean) { if (clean) { cleanVal.add(eCleanVal.CLEAN_EACH); cleanVal.add(eCleanVal.CLEAN_END); } } public FieldFormatterBase(EnumSet<eCleanVal> cleanVal) { this.cleanVal = cleanVal; } public FieldFormatterBase(FieldFormatterBase toClone) { this.indicatorFmt = toClone.indicatorFmt; this.sfCodeMap = toClone.sfCodeMap; this.separator = toClone.separator; this.joinVal = toClone.joinVal; this.substringStart = toClone.substringStart; this.substringEnd = toClone.substringEnd; this.cleanVal = toClone.cleanVal; if (toClone.maps != null) { if (this.maps == null) this.maps = new LinkedList<>(); for (AbstractMultiValueMapping map : toClone.maps) { if (map instanceof ExternalMethod && !((ExternalMethod)map).isThreadSafe()) { this.maps.add((AbstractMultiValueMapping) ((ExternalMethod)map).makeThreadSafeCopy()); } else { this.maps.add(map); } } } } /* * (non-Javadoc) * * @see playground.solrmarc.index.fieldmatch.FieldFormatter#getFieldTagFmt() */ @Override public String getFieldTagFmt() { return fieldTagFmt; } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#setFieldTagFmt(java. * lang.String) */ @Override public FieldFormatter setFieldTagFmt(String fieldTagFmt) { this.fieldTagFmt = fieldTagFmt; return(this); } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#getIndicatorFmt() */ @Override public String getIndicatorFmt() { return indicatorFmt; } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#setIndicatorFmt(java. * lang.String) */ @Override public FieldFormatter setIndicatorFmt(String indicatorFmt) { this.indicatorFmt = indicatorFmt; return(this); } // /* // * (non-Javadoc) // * // * @see playground.solrmarc.index.fieldmatch.FieldFormatter#getSfCodeFmt() // */ // @Override // public String getSfCodeFmt(char sfCode) // { // if (sfCodeMap != null && sfCodeMap.containsKey(sfCode)) // { // return(sfCodeMap.get(sfCode)); // } // return(null); // } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#setSfCodeFmt(java. * lang.String) */ @Override public FieldFormatter setSfCodeFmt(String[] mapParts) { if (sfCodeMap == null) sfCodeMap = new LinkedHashMap<String, String>(); for (String part : mapParts) { String[] pieces = part.split("=>", 2); if (pieces.length == 2 && pieces[0].length() == 1) { sfCodeMap.put(pieces[0], pieces[1]); } else if (fieldFormat == null && !part.equals("format")) { fieldFormat = part; } } return(this); } @Override public String getFieldFormat() { return fieldFormat; } @Override public boolean hasFieldFormat() { return (fieldFormat != null); } /* * (non-Javadoc) * * @see playground.solrmarc.index.fieldmatch.FieldFormatter#getSeparator() */ @Override public String getSeparator() { return separator; } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#setSeparator(java. * lang.String) */ @Override public FieldFormatter setSeparator(String separator) { this.separator = separator; return(this); } /* * (non-Javadoc) * * @see playground.solrmarc.index.fieldmatch.FieldFormatter#getCleanVal() */ @Override public EnumSet<eCleanVal> getCleanVal() { return cleanVal; } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#setCleanVal(java.util * .EnumSet) */ @Override public FieldFormatter setCleanVal(EnumSet<eCleanVal> cleanVal) { this.cleanVal = cleanVal; return(this); } /* * (non-Javadoc) * * @see playground.solrmarc.index.fieldmatch.FieldFormatter#addCleanVal( * playground.solrmarc.index.fieldmatch.FieldFormatterBase.eCleanVal) */ @Override public FieldFormatter addCleanVal(eCleanVal cleanVal) { this.cleanVal.add(cleanVal); return(this); } @Override public eJoinVal getJoinVal() { return joinVal; } @Override public FieldFormatter setJoinVal(eJoinVal joinVal) { this.joinVal = joinVal; return(this); } @Override public FieldFormatter setSubstring(int offset, int endOffset) { this.substringStart = offset; this.substringEnd = endOffset; return(this); } @Override public FieldFormatter addMap(AbstractMultiValueMapping valueMapping) { if (maps == null) maps = new LinkedList<>(); maps.add(valueMapping); return(this); } /* * (non-Javadoc) * * @see playground.solrmarc.index.fieldmatch.FieldFormatter#start(java.lang. * StringBuilder) */ @Override public StringBuilder start() { return new StringBuilder(); // return (makeResult()); } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addTag(org.marc4j. * marc.VariableField, java.lang.StringBuilder) */ @Override public void addTag(StringBuilder sb, VariableField df) { if (fieldFormat != null && fieldFormat.contains("%tag")) { sbReplace(sb, "%tag", df.getTag()); } else if (fieldTagFmt != null) { sb.append(fieldTagFmt.contains("%tag") ? fieldTagFmt.replaceAll("%tag", df.getTag()) : df.getTag()); } } private void sbReplace(StringBuilder sb, String pattern, String value) { int indexOf = sb.indexOf(pattern); if (indexOf != -1) { sb.replace(indexOf, indexOf + pattern.length(), value); } } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addIndicators(org. * marc4j.marc.VariableField, java.lang.StringBuilder) */ @Override public void addIndicators(StringBuilder sb, VariableField df) { if (fieldFormat != null && (fieldFormat.contains("%1") || fieldFormat.contains("%2"))) { sbReplace(sb, "%1", ""+((DataField) df).getIndicator1()); sbReplace(sb, "%2", ""+((DataField) df).getIndicator2()); } else if (indicatorFmt != null && df instanceof DataField) { String result = indicatorFmt.replaceAll("%1", "" + ((DataField) df).getIndicator1()).replaceAll("%2", "" + ((DataField) df).getIndicator1()); sb.append(result); } } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addCode(java.lang. * String, java.lang.StringBuilder) */ @Override public void addCode(StringBuilder sb, String codeStr) { // if (sfCodeFmt != null) // { // buffer.append(sfCodeFmt.replaceAll("%sf", codeStr)); // } } @Override public Collection<String> handleMapping(Collection<String> cleaned) throws Exception { if (maps == null) return (cleaned); Collection<String> mapped = cleaned; for (AbstractMultiValueMapping map : maps) { mapped = map.map(mapped); } return(mapped); } @Override public String handleSubFieldFormat(String sfCode, String mappedDataVal) { if (sfCodeMap == null || !sfCodeMap.containsKey(sfCode)) return (mappedDataVal); String value = sfCodeMap.get(sfCode); value = value.replace("$"+sfCode, mappedDataVal); return(value); } private final String getSubstring(final String data) { try { if (substringStart != -1) { if (substringEnd != -1) return data.substring(substringStart, substringEnd); else return data.substring(substringStart); } else { return(data); } } catch (IndexOutOfBoundsException ioobe) { return(""); } } private static Pattern ACCENTS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); public String cleanData(VariableField vf, boolean isSubfieldA, String data) { final EnumSet<eCleanVal> cleanVal = getCleanVal(); int numToDel = 0; String trimmed = data; if (cleanVal.contains(eCleanVal.STRIP_INDICATOR_2) && isSubfieldA && vf instanceof DataField) { DataField df = (DataField) vf; char ind2Val = df.getIndicator2(); numToDel = (ind2Val >= '0' && ind2Val <= '9') ? ind2Val - '0' : 0; if (numToDel > 0) trimmed = trimmed.substring(numToDel); } trimmed = cleanVal.contains(eCleanVal.UNTRIMMED) ? getSubstring(trimmed) : getSubstring(trimmed).trim(); String str = (cleanVal.contains(eCleanVal.CLEAN_EACH)) ? DataUtil.cleanData(trimmed) : trimmed; if (!cleanVal.contains(eCleanVal.STRIP_ACCCENTS) && !cleanVal.contains(eCleanVal.STRIP_ALL_PUNCT) && !cleanVal.contains(eCleanVal.TO_LOWER) && !cleanVal.contains(eCleanVal.TO_UPPER) && !cleanVal.contains(eCleanVal.TO_TITLECASE) && !cleanVal.contains(eCleanVal.STRIP_INDICATOR_2)) { return (str); } // Do more extensive cleaning of data. if (cleanVal.contains(eCleanVal.STRIP_ACCCENTS)) { str = ACCENTS.matcher(Normalizer.normalize(str, Form.NFD)).replaceAll(""); StringBuilder folded = new StringBuilder(); boolean replaced = false; for (char c : str.toCharArray()) { char newc = Utils.foldDiacriticLatinChar(c); if (newc != 0x00) { folded.append(newc); replaced = true; } else { folded.append(c); } } if (replaced) str = folded.toString(); } if (cleanVal.contains(eCleanVal.STRIP_ALL_PUNCT)) str = str.replaceAll("( |\\p{Punct})+", " "); if (!cleanVal.contains(eCleanVal.UNTRIMMED)) str = str.trim(); if (cleanVal.contains(eCleanVal.TO_LOWER)) { str = str.toLowerCase(); } else if (cleanVal.contains(eCleanVal.TO_UPPER)) { str = str.toUpperCase(); } else if (cleanVal.contains(eCleanVal.TO_TITLECASE)) { str = DataUtil.toTitleCase(str); } return str; } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addVal(java.lang. * String, java.lang.StringBuilder) */ @Override public void addVal(StringBuilder sb, String sfcode, String data) { if (fieldFormat != null && sfcode != null) { sbReplace(sb, "$"+sfcode, data); } else { sb.append(data); } } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addSeparator(int, * java.lang.StringBuilder) */ @Override public void addSeparator(StringBuilder sb, int cnt) { if (fieldFormat != null) { // if formatting field ignore "separate" } else if (joinVal == eJoinVal.JOIN && getSeparator() != null) { if (cnt != 0) sb.append(getSeparator()); } } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addAfterSubfield(java * .util.Collection, java.lang.StringBuilder) */ @Override public void addAfterSubfield(StringBuilder sb, Collection<String> result) { if (fieldFormat != null) { // if formatting field ignore "separate" } else if (joinVal == eJoinVal.SEPARATE) { if (sb.length() == 0) return; final String field = (this.getCleanVal().contains(eCleanVal.CLEAN_END)) ? DataUtil.cleanData(sb.toString()) : sb.toString(); if (field.length() > 0) result.add(field); sb.setLength(0); } } /* * (non-Javadoc) * * @see * playground.solrmarc.index.fieldmatch.FieldFormatter#addAfterField(java. * util.Collection, java.lang.StringBuilder) */ @Override public void addAfterField(StringBuilder sb, Collection<String> result) { if (fieldFormat != null) { String fieldVal = sb.toString().replaceAll("\\$[a-z0-9]", ""); if (fieldVal.length() == 0) return; final String field = (this.getCleanVal().contains(eCleanVal.CLEAN_END)) ? DataUtil.cleanData(fieldVal) : fieldVal; if (field.length() > 0) result.add(field); sb.setLength(0); } else if (joinVal == eJoinVal.JOIN) { if (sb.length() == 0) return; final String field = (this.getCleanVal().contains(eCleanVal.CLEAN_END)) ? DataUtil.cleanData(sb.toString()) : sb.toString(); if (field.length() > 0) result.add(field); sb.setLength(0); } } // @Override // public Collection<String> makeResult() // { // Collection<String> result; // if (unique) // { // result = new LinkedHashSet<String>(); // } // else // { // result = new ArrayList<String>(); // } // return result; // } @Override public Collection<String> prepData(VariableField vf, boolean isSubfieldA, String data) throws Exception { final String cleaned = cleanData(vf, isSubfieldA, data); @SuppressWarnings("unchecked") final List<String> cleanedDataAsList = (cleaned == null || cleaned.length() == 0) ? Collections.EMPTY_LIST : Collections.singletonList(cleaned); Collection<String> result = handleMapping(cleanedDataAsList); return (result); } @Override public boolean isThreadSafe() { if (maps == null) return(true); for (AbstractMultiValueMapping map : maps) { if (map instanceof ExternalMethod && !((ExternalMethod)map).isThreadSafe()) { return(false); } } return(true); } @Override public Object makeThreadSafeCopy() { return new FieldFormatterBase(this); } }